if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
if (!requireNamespace("GEOmetadb", quietly = TRUE)) BiocManager::install("GEOmetadb")
if (!requireNamespace("edgeR", quietly = TRUE)) BiocManager::install("edgeR")
if (!requireNamespace("biomaRt", quietly = TRUE)) BiocManager::install("biomaRt")
if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
if (!requireNamespace("gdtools", quietly = TRUE)) install.packages("gdtools")
if (!requireNamespace("kableExtra", quietly = TRUE)) install.packages("kableExtra")
if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table")
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) BiocManager::install("ComplexHeatmap")
library(knitr)
library(GEOmetadb)
library(edgeR)
library(biomaRt)
library(dplyr)
library(kableExtra)
library(data.table)
library(ComplexHeatmap)
library(circlize)
Study: SARS-CoV-2 Infected Cardiomyocytes Recruit Monocytes by Secreting CCL2(Chen et al. 2020)
GEO dataset: GSE151879
The aim of the sudy is to provide evidence that SARS-CoV-2 infects human cardiomyocytes.
In this report I am preforming differential gene expression analysis and preliminary over-representation analysis.
In A1, I chose my data set, downloaded it using GEOmetadb, cleaned it and normalized it.
In this section, there is a brief summary of the steps I went through from getting the data set to obtaining the normalized data.
sub_dir <- "GSE151879"
p <-file.path(getwd(), sub_dir)
#if the data exists no need to download it again
if (dir.exists(p)){
setwd(p)
Adult_path <- "GSE151879_raw_counts_genes.Adult_human_cardiomyocytes.txt.gz"
Macrophages_path <- "GSE151879_raw_counts_genes.Macrophages.txt.gz"
hESC_path <- "GSE151879_raw_counts_genes.hESC-derived_cardiomyocytes.txt.gz"
Adult_human_CM = read.delim(Adult_path,header=TRUE,check.names = FALSE)
Macrophages = read.delim(Macrophages_path,header=TRUE,check.names = FALSE)
hESC_CM = read.delim (hESC_path,header=TRUE,check.names = FALSE)
} else{
sfiles = getGEOSuppFiles("GSE151879")
fnames = rownames(sfiles)
# there are three supplemental files, I chose to work with Adult_human_CM
Adult_human_CM = read.delim(fnames[1],header=TRUE,check.names = FALSE)
Macrophages = read.delim(fnames[2],header=TRUE,check.names = FALSE)
hESC_CM = read.delim(fnames[3],header=TRUE,check.names = FALSE)
}
#translate out counts into counts per million using the edgeR package
cpms = cpm(Adult_human_CM[,2:7])
rownames(cpms) <- Adult_human_CM[,1]
# get rid of low counts
keep = rowSums(cpms >1) >=3
Adult_human_CM_filtered = Adult_human_CM[keep,]
samples <- data.frame(lapply(colnames(Adult_human_CM_filtered)[2:7],
FUN=function(x){unlist(strsplit(x, split = "\\_"))[c(4,5)]}))
colnames(samples) <- colnames(Adult_human_CM_filtered)[2:7]
rownames(samples) <- c("condition","number")
samples <- data.frame(t(samples))
ensembl <- useMart("ensembl")
ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
conversion_stash <- "Adult_human_CM_id_conversion.rds"
if (file.exists(conversion_stash)) {
Adult_human_CM_id_conversion<- readRDS(conversion_stash)
} else {
Adult_human_CM_id_conversion <- getBM(attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = c("ensembl_gene_id"),
values = Adult_human_CM_filtered$gene_id,
mart = ensembl)
saveRDS(Adult_human_CM_id_conversion, conversion_stash)
}
# merge mapped data frame with original filtered data frame
colnames(Adult_human_CM_id_conversion) <- c("gene_id", "hgnc_symbol")
data_table_1 = data.table(Adult_human_CM_filtered, key="gene_id")
data_table_2 = data.table(Adult_human_CM_id_conversion, key="gene_id")
dt.merged <- merge(data_table_1, data_table_2, all = T)
#reorder data frame
dt.merged <- dt.merged[, c(1,8,2,3,4,5,6,7)]
# Create our DGEList object to be used by edgeR
filtered_data_matrix <- as.matrix(dt.merged[,3:8])
rownames(filtered_data_matrix) <- dt.merged$gene_id
d = DGEList(counts=filtered_data_matrix, group =samples$condition )
# Calculate the normalization factors
d = calcNormFactors(d)
normalized_counts <- cpm(d)
# Merge the gene names with normalized data
normalized_count_data = data.table(normalized_counts)
normalized_count_data$gene_id <- dt.merged$gene_id
normalized_count_data$hgnc_symbol <- dt.merged$hgnc_symbol
normalized_count_data <- normalized_count_data[, c(7,8,1,2,3,4,5,6)]
# Take a look at normalized data
kable(normalized_count_data[1:5,1:8], type="html")
| gene_id | hgnc_symbol | Adult_human_cardiomyocytes_Mock_1 | Adult_human_cardiomyocytes_Mock_2 | Adult_human_cardiomyocytes_Mock_3 | Adult_human_cardiomyocytes_SARS-CoV2_1 | Adult_human_cardiomyocytes_SARS-CoV2_2 | Adult_human_cardiomyocytes_SARS-CoV2_3 |
|---|---|---|---|---|---|---|---|
| ENSG00000000003 | TSPAN6 | 20.74612 | 18.886264 | 20.98601 | 23.418275 | 21.334108 | 25.215002 |
| ENSG00000000419 | DPM1 | 31.52837 | 31.866359 | 36.76288 | 33.195913 | 35.429858 | 38.663004 |
| ENSG00000000457 | SCYL3 | 12.31673 | 12.216560 | 12.42508 | 13.973335 | 13.734834 | 15.129001 |
| ENSG00000000460 | C1orf112 | 2.61884 | 2.649916 | 2.62592 | 4.399013 | 4.451289 | 4.465157 |
| ENSG00000000971 | CFH | 396.28362 | 364.902374 | 409.08841 | 385.338754 | 345.456151 | 376.036234 |
MDS plot represents the distances between samples. I can see a clear distinction between the two conditions.
hold <- vector()
for (i in 1:nrow(samples)) {
lab <-c(samples$condition[i],samples$number[i])
hold<- c(hold, paste(lab, collapse = "_") )
}
plotMDS(d, labels= hold,
col = c("darkgreen","blue")[factor(samples$condition)], main = "MDS Plot")
# Use SARS-CoV2 condition for model
model_design <- model.matrix(~samples$condition)
d <- estimateDisp(d, model_design)
plotBCV(d,col.tagwise = "black",col.common = "red", main = "BCV Plot")
plotMeanVar(d, show.raw.vars = TRUE, show.tagwise.vars=TRUE,
show.ave.raw.vars = TRUE,
NBline=TRUE,
show.binned.common.disp.vars = TRUE, main = "Mean-Variance Plot")
# fit the model
fit <- glmQLFit(d, model_design)
#calculate differential expression
qlf.pos_vs_neg <- glmQLFTest(fit, coef='samples$conditionSARS-CoV2')
qlf_output_hits <- topTags(qlf.pos_vs_neg,sort.by = "PValue",
n = nrow(normalized_count_data))
kable(topTags(qlf.pos_vs_neg), type="html",row.names = FALSE, digits = 32)
|
|
|
|
# number of genes passed the threshold p-value < 0.05
length(which(qlf_output_hits$table$PValue < 0.05))
## [1] 7617
# number of genes passed correction
length(which(qlf_output_hits$table$FDR < 0.05))
## [1] 6890
The volcano plot shows the significantly differentially expressed genes in red.
# code adopted from http://www.nathalievialaneix.eu/doc/html/solution_edgeR-tomato.html#volcano-plot
volcanoData <- cbind(qlf_output_hits$table$logFC, -log10(qlf_output_hits$table$FDR))
colnames(volcanoData) <- c("logFC", "negLogPval")
DEGs <- qlf_output_hits$table$FDR < 0.05 & abs(qlf_output_hits$table$logFC) > 1
point.col <- ifelse(DEGs, "red", "black")
plot(volcanoData, pch = 16, col = point.col, cex = 0.5, main = "Volcano Plot")
# make column name shorter for better heat map visibility, "CM" stands for Cardiomyocytes
normalized_count_data_colnameChange <- normalized_count_data
colnames(normalized_count_data_colnameChange)[3] <- "CM_Mock_1"
colnames(normalized_count_data_colnameChange)[4] <- "CM_Mock_2"
colnames(normalized_count_data_colnameChange)[5] <- "CM_Mock_3"
colnames(normalized_count_data_colnameChange)[6] <- "CM_SARS-CoV2_1"
colnames(normalized_count_data_colnameChange)[7] <- "CM_SARS-CoV2_2"
colnames(normalized_count_data_colnameChange)[8] <- "CM_SARS-CoV2_3"
heatmap_matrix <- normalized_count_data_colnameChange[, 3:ncol(normalized_count_data_colnameChange)]
rownames(heatmap_matrix) <- normalized_count_data_colnameChange$gene_id
colnames(heatmap_matrix) <- colnames(normalized_count_data_colnameChange[,3:ncol(normalized_count_data_colnameChange)])
top_hits <- rownames(qlf_output_hits$table)[qlf_output_hits$table$PValue
<0.05]
heatmap_matrix_tophits <- t(scale(t(heatmap_matrix[which(rownames(heatmap_matrix) %in% top_hits),])))
if(min(heatmap_matrix_tophits) == 0){
heatmap_col = colorRamp2(c( 0, max(heatmap_matrix_tophits)),
c( "white", "red"))
} else {
heatmap_col = colorRamp2(c(min(heatmap_matrix_tophits),
0, max(heatmap_matrix_tophits)),
c("blue", "white", "red"))
}
current_heatmap <- Heatmap(as.matrix(heatmap_matrix_tophits),
show_row_dend = TRUE,
show_column_dend = TRUE,
col=heatmap_col,
show_column_names = TRUE,
show_row_names = FALSE,
show_heatmap_legend = TRUE,
)
# Show heat map
current_heatmap
See section 2.3. 7617 genes passed the p-value threshold of < 0.05 so they are significantly differentially expressed . I used 0.05 threshold as it is widely accepted cutoff for significance.
I used Benjamni-Hochberg correction for multiple hypothesis testing because it is not as stringent as the Bonferroni correction and also it is used in the edgeR package. 6890 genes passed correction.
See section 2.4.
See section 2.5. Yes, they do cluster together due to similarity.
# merge output hits with gene names
qlf_output_hits_withg = as.data.frame(qlf_output_hits)
qlf_output_hits_withg <- data.frame(names = row.names(qlf_output_hits_withg), qlf_output_hits_withg)
rownames(qlf_output_hits_withg ) <- c()
colnames(qlf_output_hits_withg)[1] <- "gene_id"
qlf_output_hits_withgn <- merge(dt.merged[,1:2],qlf_output_hits_withg)
qlf_output_hits_withgn <-qlf_output_hits_withgn[order(qlf_output_hits_withgn$PValue),]
# add rank column
qlf_output_hits_withgn[,"rank"] <- -log(qlf_output_hits_withgn$PValue, base = 10) * sign(qlf_output_hits_withgn$logFC)
qlf_output_hits_withgn <- qlf_output_hits_withgn[order(qlf_output_hits_withgn$rank),]
# P.S gaps exist in the file where there are no hgnc_symbols (NA) because they were not mapped.
upregulated_genes <- qlf_output_hits_withgn$hgnc_symbol[
which(qlf_output_hits_withgn$PValue < 0.05
& qlf_output_hits_withgn$logFC > 0)]
downregulated_genes <- qlf_output_hits_withgn$hgnc_symbol[
which(qlf_output_hits_withgn$PValue < 0.05
& qlf_output_hits_withgn$logFC < 0)]
write.table(x=upregulated_genes,
file=file.path("./","upregulated_genes.txt"),sep ="\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=downregulated_genes,
file=file.path("./","downregulated_genes.txt"),sep ="\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=data.frame(genename= qlf_output_hits_withgn$hgnc_symbol, F_stat = qlf_output_hits_withgn$rank),
file=file.path("./","ranked_genelist.txt"),sep ="\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
I used g:Profiler web server to perform the gene list enrichment analysis.
I ran g:Profiler 3 times: one for each list respectively (upregulated_genes, downregulated_genes, ranked_genelist)
Options configuration:
Advanced options:
No evidence codes
Significance threshold: Benjamini-Hochberg FDR
User threshold: 0.05
Data sources:
GO biological process
No electronic GO annotations
Reactome
WikiPathways
After query, term size was restricted to 200 genes.
I used g:Profiler because I am working with thresholded lists.
GO biological process release 2021-02-01.
Reactome version 75.
Wikipathways version 20210310
All queries are with 0.05 threshold and maximum term size of 200.
Upregulted genes:
Go results: 329 genesets.
Reactome results: 198 genesets
Wikipathways results: 63 genesets
Downregulted genes:
Go results: 37 genesets
Reactome results: 43 genesets
Wikipathways results: 26 genesets
All genes:
Go results: 1032 genesets
Reactome results: 581 genesets
Wikipathways results: 184 genesets
The results for all genes had a mix of genesets from both upregulated genes and downregulated genes results.
The main paper(Chen et al. 2020) mentions that SARS-CoV2 infected cells show downregulation of functional cardiomyocytes associated genes and upregulation of ROS related genes. They only gave a few genes as an example of both. My results show that the upregulated genes are mostly in DNA replication processes. According to this paper(Reczek and Chandel 2017) , as ROS level increases in cells, cell profileration increases up to a point then cell death occurs. This is consistent with the main paper and my results as ROS related genes are upregulated causing cell death when ROS reaches a high level. On the otherhand, my results show that the downregulated genes are mostly in metbalic processes like rRNA processing and cholestrol biosynthesis. This is consistent with the paper’s results as downregulation of genes associated with metabolism causes cell death. So overall, SARS-CoV2 infected cells are more susceptible to cell death.
Yes, this paper(Bojkova et al. 2020) also shows that SARS-CoV-2 infects and induces cytotoxic effects in human cardiomyocytes.
Bojkova, Denisa, Julian U G Wagner, Mariana Shumliakivska, Galip S Aslan, Umber Saleem, Arne Hansen, Guillermo Luxán, et al. 2020. “SARS-CoV-2 infects and induces cytotoxic effects in human cardiomyocytes.” Cardiovascular Research 116 (14): 2207–15. https://doi.org/10.1093/cvr/cvaa267.
Chen, Shuibing, Liuliu Yang, Benjamin Nilsson-Payant, Yuling Han, Fabrice Jaffré, Jiajun Zhu, Pengfei Wang, et al. 2020. “SARS-Cov-2 Infected Cardiomyocytes Recruit Monocytes by Secreting Ccl2.”
Reczek, Colleen R., and Navdeep S. Chandel. 2017. “The Two Faces of Reactive Oxygen Species in Cancer.” Annual Review of Cancer Biology 1 (1): 79–98. https://doi.org/10.1146/annurev-cancerbio-041916-065808.